
# This file mainly presents how to get the descriptive analysis result and Pathway 1's related analysis results

# Relevant tables in paper include:

  # Descriptive analysis: Table 2

  # Pathway 1: Table 3: Logistic Regression Results (Pathway 1)
             # Table 4: Predictive Accuracy Results (Pathway 1)
             # Table 8: Variance Inflation Factor Values of Variables (Pathway 1 part)
             # Table 9: Models’ performance metrics (Pathway 1 part)

### R version: 4.4.2
#-----------------Prepare the libraries and the dataset------------------------#

rm(list = ls())

library(readxl)  
library(tidyverse)  
library(fixest)        
library(lme4)          
library(pROC)          
library(ResourceSelection) 
library(car)
library(knitr)

options(scipen = 100)  

# Input the dataset

PSIdata <- read_excel("~/Desktop/PSI.xlsx")

#—————————————————————-—————Table 2:Descriptive analysis————————————-———————-——#

# Table 2:Descriptive analysis

desc_stats <- PSIdata %>%
  select(win, innovation, impact, replicability,wordcount,numeric,
         subjectivity,emotionality,certainty,Year, Country) %>%
  summarise(across(everything(), 
                   list(N = ~length(.),
                        Mean = ~round(mean(., na.rm = TRUE), 3),
                        SD = ~round(sd(., na.rm = TRUE), 3),
                        Min = ~round(min(., na.rm = TRUE), 3),
                        Max = ~round(max(., na.rm = TRUE), 3)))) %>%
  pivot_longer(everything(), 
               names_to = c("Variable", ".value"), 
               names_pattern = "(.*)_(.*)")

print(desc_stats)

#———————————————Table 3: Logistic Regression Result (Pathway 1)—————————————---#

#1) Select the UNPSA data------------------------------------------------
# text_type==0 means UNPSA text; text_type==1 means synthetic (AI-generated) text

PSIdata1 <- subset(PSIdata, text_type==0)  
PSIdata1 <- PSIdata1 %>%  
  mutate(  
    Country = as.factor(Country),  
    Year = as.factor(as.character(Year))
  )  

# Create train/test splits 

train_data1 <- PSIdata1 %>% filter(Year == "2018")  
test_data1  <- PSIdata1 %>% filter(Year %in% c("2019", "2020", "2021", "2022"))  

train_data2 <- PSIdata1 %>% filter(Year %in% c("2018", "2019"))  
test_data2  <- PSIdata1 %>% filter(Year %in% c("2020", "2021", "2022"))  

train_data3 <- PSIdata1 %>% filter(Year %in% c("2018", "2019", "2020"))  
test_data3  <- PSIdata1 %>% filter(Year %in% c("2021", "2022"))  

train_data4 <- PSIdata1 %>% filter(Year %in% c("2018", "2019", "2020", "2021"))  
test_data4  <- PSIdata1 %>% filter(Year == "2022")  

# identify predictor vector 

predictors <- c("innovation", "impact", "replicability",  
                "subjectivity", "numeric", "emotionality", "certainty","wordcount")  
pred_formula_rhs <- paste(predictors, collapse = " + ")  

# 3) Regression models------------------------------------------------

fit_glmer <- function(train_df, model_name) {  
  re_terms <- c()  
  if (n_distinct(train_df$Country) > 1) re_terms <- c(re_terms, "(1|Country)")  
  if (n_distinct(train_df$Year) > 1)    re_terms <- c(re_terms, "(1|Year)")  
  re_part <- if (length(re_terms) > 0) paste("+", paste(re_terms, collapse = " + ")) else ""  
  fmla <- as.formula(paste0("win ~ ", pred_formula_rhs, " ", re_part))  
  message("Fitting GLMM for ", model_name, ": ", deparse(fmla))  
  glmer(fmla, data = train_df, family = binomial(link = "logit"),  
        control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)))  
}  

# Fit all four GLMM models
model1 <- fit_glmer(train_data1, "Model 1")  
model2 <- fit_glmer(train_data2, "Model 2")  
model3 <- fit_glmer(train_data3, "Model 3")  
model4 <- fit_glmer(train_data4, "Model 4")  

# Extract coefficients from GLMM models
extract_glmm_coefficients <- function(model, model_name) {
  coef_summary <- summary(model)$coefficients
  coef_df <- as.data.frame(coef_summary)
  coef_df$Variable <- rownames(coef_df)
  coef_df$Model <- model_name
  coef_df <- coef_df[, c("Model", "Variable", "Estimate", "z value", "Pr(>|z|)")]
  names(coef_df) <- c("Model", "Variable", "Coefficient", "z_value", "p_value")
  
  coef_df$Significance <- ifelse(coef_df$p_value < 0.001, "***",
                                 ifelse(coef_df$p_value < 0.01, "**",
                                        ifelse(coef_df$p_value < 0.05, "*", "")))
  return(coef_df)
}

# 4) Output results-------------------------------------------------------------

# Model 1 regression results---------------------------------------------------- 
coef_model1 <- extract_glmm_coefficients(model1, "Model 1 (Train: 2018)")
kable(coef_model1[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 1 (UNPSA Winner of 2019-2022)")
cat("N (Train) =", nrow(train_data1), "observations\n")
cat("N (Test) =", nrow(test_data1), "observations\n\n")

# Model 2 regression results----------------------------------------------------
coef_model2 <- extract_glmm_coefficients(model2, "Model 2 (Train: 2018-2019)")
kable(coef_model2[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 2 (UNPSA Winner of 2020-2022)")
cat("N (Train) =", nrow(train_data2), "observations\n")
cat("N (Test) =", nrow(test_data2), "observations\n\n")


# Model 3 regression results---------------------------------------------------- 
coef_model3 <- extract_glmm_coefficients(model3, "Model 3 (Train: 2018-2020)")
kable(coef_model3[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 3 (UNPSA Winner of 2021-2022)")
cat("N (Train) =", nrow(train_data3), "observations\n")
cat("N (Test) =", nrow(test_data3), "observations\n\n")


# Model 4 regression results---------------------------------------------------- 
coef_model4 <- extract_glmm_coefficients(model4, "Model 4 (Train: 2018-2021)")
kable(coef_model4[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 4 (UNPSA Winner of 2022)")
cat("N (Train) =", nrow(train_data4), "observations\n")
cat("N (Test) =", nrow(test_data4), "observations\n\n")



#-----------Table 4:Predictive Accuracy Results (Pathway 1)--------------------#

# 1) calculate the model performance metrics------------------------------------

calculate_pr_auc <- function(actual, probs) {
  if (length(actual) != length(probs)) {
    stop("Not match")
  }
  
  # remove NA value
  valid_idx <- !is.na(actual) & !is.na(probs)
  actual <- actual[valid_idx]
  probs <- probs[valid_idx]
  
  if (sum(actual) == 0) {
    return(0)
  }
  
  # calculate Precision-Recall curve
  pr_curve <- data.frame(
    threshold = seq(0, 1, by = 0.01),
    precision = NA,
    recall = NA
  )
  
  for (i in 1:nrow(pr_curve)) {
    threshold <- pr_curve$threshold[i]
    pred_class <- ifelse(probs >= threshold, 1, 0)
    
    # Calculate the confusion matrix
    tp <- sum(actual == 1 & pred_class == 1)
    tn <- sum(actual == 0 & pred_class == 0)
    fp <- sum(actual == 0 & pred_class == 1)
    fn <- sum(actual == 1 & pred_class == 0)
    
    # calculate Precision and Recall
    precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 1)
    recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
    
    pr_curve$precision[i] <- precision
    pr_curve$recall[i] <- recall
  }
  
  pr_curve <- pr_curve[order(pr_curve$recall, -pr_curve$precision), ]
  pr_curve <- pr_curve[!duplicated(pr_curve$recall), ]
  
  # calculate the PR AUC 
  pr_auc <- 0
  for (i in 2:nrow(pr_curve)) {
    width <- pr_curve$recall[i] - pr_curve$recall[i-1]
    height <- (pr_curve$precision[i] + pr_curve$precision[i-1]) / 2
    pr_auc <- pr_auc + width * height
  }
  
  return(pr_auc)
}

calculate_metrics <- function(actual, predicted, probs) {
  conf_matrix <- table(actual, predicted)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  recall <- conf_matrix[2,2] / sum(conf_matrix[2,])
  specificity <- conf_matrix[1,1] / sum(conf_matrix[1,])
  precision <- conf_matrix[2,2] / sum(conf_matrix[,2])
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # ROC AUC
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # PR AUC
  pr_auc_val <- calculate_pr_auc(actual, probs)
  
  return(list(
    confusion_matrix = conf_matrix,
    accuracy = accuracy,
    recall = recall,
    specificity = specificity,
    precision = precision,
    f1_score = f1_score,
    auc = auc_val,
    pr_auc = pr_auc_val
  ))
}

pred_prob1 <- predict(model1, newdata = test_data1, type = "response", allow.new.levels = TRUE)  
pred_prob2 <- predict(model2, newdata = test_data2, type = "response", allow.new.levels = TRUE)  
pred_prob3 <- predict(model3, newdata = test_data3, type = "response", allow.new.levels = TRUE)  
pred_prob4 <- predict(model4, newdata = test_data4, type = "response", allow.new.levels = TRUE)  

pred_class1 <- ifelse(pred_prob1 >= 0.5, 1, 0)  
pred_class2 <- ifelse(pred_prob2 >= 0.5, 1, 0)  
pred_class3 <- ifelse(pred_prob3 >= 0.5, 1, 0)  
pred_class4 <- ifelse(pred_prob4 >= 0.5, 1, 0)  

# 2) output the predictive accuracy results (Table 4)---------------------------

metrics1 <- calculate_metrics(test_data1$win, pred_class1, pred_prob1)
metrics2 <- calculate_metrics(test_data2$win, pred_class2, pred_prob2)
metrics3 <- calculate_metrics(test_data3$win, pred_class3, pred_prob3)
metrics4 <- calculate_metrics(test_data4$win, pred_class4, pred_prob4)

# Predictive Accuracy Results of Model 1----------------------------------------
print(metrics1) 

# Predictive Accuracy Results of Model 2----------------------------------------
print(metrics2) 

# Predictive Accuracy Results of Model 3----------------------------------------
print(metrics3) 

# Predictive Accuracy Results of Model 4----------------------------------------
print(metrics4) 


#---------Table 8: Variance Inflation Factor Values of Variables--------------#
# Models 1, 2, 3, 4 part

round(vif(model1),3)
round(vif(model2),3)
round(vif(model3),3)
round(vif(model4),3)


#-------------------------Table 9: Models’ performance metrics----------------#
# Models 1, 2, 3, 4 part

# 1) construct the performance table--------------------------------------------

performance_table <- data.frame(
  Model = c("Model 1", "Model 2", "Model 3", "Model 4"),
  Precision = c(metrics1$precision, metrics2$precision, metrics3$precision, metrics4$precision),
  Recall = c(metrics1$recall, metrics2$recall, metrics3$recall, metrics4$recall),
  F1_Score = c(metrics1$f1_score, metrics2$f1_score, metrics3$f1_score, metrics4$f1_score),
  PR_AUC = c(metrics1$pr_auc, metrics2$pr_auc, metrics3$pr_auc, metrics4$pr_auc)
)

performance_table[, -1] <- round(performance_table[, -1], 3)

# 2) Output the performance results---------------------------------------------

cat("\nPerformance Metrics Table:\n")
print(performance_table, row.names = FALSE)



